Preprocess the test set to extract words and generate dictionaries
In [1]:
import pandas as pd
import numpy as np
import string
import time
from scipy.sparse import *
from scipy.io import mmwrite, mmread
import csv
from bs4 import BeautifulSoup
import nltk
from nltk.tag import brill
from taggerfunctions import *
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from ast import literal_eval
import sklearn as sk
In [2]:
uselesssymbols = ['. ', '\n', "'", '\"', '(', ')', ',', ';', ':', '?', '!', '&', '$']
def tokenizeWords(entry):
    # strip markup and drop code blocks, links, and images before tokenizing
    soup = BeautifulSoup(entry, 'html.parser')
    for tag in soup.find_all(["pre", "code", "a", "img"]):
        tag.decompose()
    # drop non-ASCII characters, then replace punctuation with spaces
    entry = soup.get_text().encode('ascii', 'ignore').decode('ascii')
    for symbol in uselesssymbols:
        entry = entry.replace(symbol, ' ')
    entrytok = nltk.word_tokenize(entry)
    entrytok = [w.lower() for w in entrytok]
    return tag_pos(entrytok)
In [3]:
def tag_pos(entrytok):
    # keep only content-bearing parts of speech and lemmatize them
    entryselect = []
    entrytoktag = braubt_tagger.tag(entrytok)
    for tok, tag in entrytoktag:
        if tag not in ('VBP', 'CC', 'CD', 'RB', 'TO', 'VB', 'DT', 'IN', 'PRP', 'VBZ', 'WDT', '-NONE-'):
            try:
                tok_lemmatized = lemmatizer.lemmatize(tok, get_wordnet_pos(tag))
            except Exception:
                tok_lemmatized = lemmatizer.lemmatize(tok)
            entryselect.append(tok_lemmatized)
    return entryselect
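tag_pos relies on get_wordnet_pos from taggerfunctions, which is not shown in this notebook. A plausible minimal sketch (an assumption, not the actual helper) maps Penn Treebank tag prefixes to WordNet POS constants and raises for anything else, which is exactly what the try/except fallback above handles:
In [ ]:
# hypothetical sketch of the helper imported from taggerfunctions
def get_wordnet_pos(treebank_tag):
    mapping = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'N': wordnet.NOUN, 'R': wordnet.ADV}
    return mapping[treebank_tag[0]]  # KeyError for unmapped tags triggers the plain lemmatize fallback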
In [4]:
braubt_tagger = braubt_Tagger()  # build the braubt tagger defined in taggerfunctions (imported above)
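A quick sanity check of the tokenizer pipeline (the lemmatizer is created here because tag_pos needs it; the exact output depends on how the braubt tagger was trained):
In [ ]:
lemmatizer = WordNetLemmatizer()
sample = "<p>Importing <code>numpy</code> fails with two errors.</p>"
print(tokenizeWords(sample))  # e.g. ['import', 'error'] -- markup, stop tags, and numbers removed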
In [5]:
def getDict(fname):
    # read a two-column CSV (key, value) into a pandas Series;
    # values were written as Python literals, so parse them back with literal_eval
    with open(fname, 'r') as f:
        reader = csv.reader(f)
        dictWords = {row[0]: literal_eval(row[1]) for row in reader}
    return pd.Series(dictWords)
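getDict assumes headerless two-column CSVs of the kind Series.to_csv writes; keys come back as strings and the values are parsed with literal_eval. A hypothetical round trip (demo.csv is an illustration, not one of the project files):
In [ ]:
pd.Series({'python': 0, 'numpy': 1}).to_csv('demo.csv', header=False)
print(getDict('demo.csv')['python'])  # -> 0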
In [6]:
fname = 'dictKeys.csv'
dictKeys = getDict(fname)
fname = 'dictWordsBodyFull.csv'
dictWordsBody = getDict(fname)
fname = 'dictWordsTitleNew.csv'
dictWordsTitle = getDict(fname)
In [42]:
reader = pd.read_csv("Test.csv", chunksize=100000)
IdTest = []
for chunk in reader:
    for idnum in chunk['Id']:
        IdTest.append(idnum)
In [45]:
for idx in range(10):
    # map question Ids to matrix row indices (and back) in blocks of 200000
    block = IdTest[idx*200000:(idx+1)*200000]
    dictId = {qid: qidnum for qidnum, qid in enumerate(block)}
    invdictId = {qidnum: qid for qidnum, qid in enumerate(block)}
    dictId = pd.Series(dictId)
    fname = "dictIdTest_" + str(idx*200000) + "-" + str((idx+1)*200000) + ".csv"
    dictId.to_csv(fname, header=False)  # no header row, so getDict can literal_eval every line
    invdictId = pd.Series(invdictId)
    fname = "invdictIdTest_" + str(idx*200000) + "-" + str((idx+1)*200000) + ".csv"
    invdictId.to_csv(fname, header=False)
In [46]:
# remainder block: rows 2000000-2013337 of the test set
tail = IdTest[2000000:]
dictId = {qid: qidnum for qidnum, qid in enumerate(tail)}
invdictId = {qidnum: qid for qidnum, qid in enumerate(tail)}
dictId = pd.Series(dictId)
fname = "dictIdTest_2000000-2013337.csv"
dictId.to_csv(fname, header=False)
invdictId = pd.Series(invdictId)
fname = "invdictIdTest_2000000-2013337.csv"
invdictId.to_csv(fname, header=False)
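Because the CSV round trip stringifies the integer question Ids, the lookups in the processing loops below use dictId[str(Id)]:
In [ ]:
dictId = getDict("dictIdTest_0-200000.csv")
print(dictId[str(IdTest[0])])  # -> 0, the matrix row assigned to the first test question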
In [ ]:
lemmatizer = WordNetLemmatizer()
reader = pd.read_csv("Test.csv", chunksize=100000)
timeStart = time.time()
count = 1
dictId = getDict("dictIdTest_0-200000.csv")
testQWordsBody_lil = lil_matrix((len(dictId), len(dictWordsBody)))
testQWordsTitle_lil = lil_matrix((len(dictId), len(dictWordsTitle)))
for chunk in reader:
    if count > 2000000:
        break  # rows beyond the last full shard are handled in the next cell
    for Id, title, body in zip(chunk['Id'], chunk['Title'], chunk['Body']):
        # bag-of-words counts for the title, restricted to the training vocabulary
        titlewords = tokenizeWords(title)
        settitle = set(titlewords)
        iterWords = [w for w in settitle if w in dictWordsTitle]
        for word in iterWords:
            testQWordsTitle_lil[dictId[str(Id)], dictWordsTitle[word]] += titlewords.count(word)
        # and the same for the body
        bodywords = tokenizeWords(body)
        setbody = set(bodywords)
        iterWords = [w for w in setbody if w in dictWordsBody]
        for word in iterWords:
            testQWordsBody_lil[dictId[str(Id)], dictWordsBody[word]] += bodywords.count(word)
        if count % 100000 == 0:
            print("entry {0:d} finished".format(count))
            print("time for 100000 loops: {0:.0f}s".format(time.time() - timeStart))
            timeStart = time.time()
        if count % 200000 == 0:
            # flush the current 200000-row shard to disk and start a fresh one
            fname = "testWordsQTitle_" + str(count-200000) + "-" + str(count) + ".mtx"
            mmwrite(fname, testQWordsTitle_lil)
            fname = "testWordsQBody_" + str(count-200000) + "-" + str(count) + ".mtx"
            mmwrite(fname, testQWordsBody_lil)
            testQWordsBody_lil = lil_matrix((200000, len(dictWordsBody)))
            testQWordsTitle_lil = lil_matrix((200000, len(dictWordsTitle)))
            print("files saved")
            if count < 2000000:  # no dictIdTest_2000000-2200000.csv exists; the tail shard has its own cell
                fname = "dictIdTest_" + str(count) + "-" + str(count+200000) + ".csv"
                dictId = getDict(fname)
        count += 1
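The saved shards can later be read back with mmread and stacked for downstream use; a minimal sketch, assuming all ten full shards were written with the filename pattern above:
In [ ]:
from scipy.sparse import vstack
shards = [mmread("testWordsQTitle_{0}-{1}.mtx".format(i*200000, (i+1)*200000)) for i in range(10)]
testQWordsTitle = vstack(shards).tocsr()  # 2000000 x len(dictWordsTitle), ready for sklearn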
In [26]:
lemmatizer = WordNetLemmatizer()
reader = pd.read_csv("Test.csv", chunksize=100000)
timeStart = time.time()
count = 1
dictId = getDict("dictIdTest_2000000-2013337.csv")
testQWordsBody_lil = lil_matrix((len(dictId), len(dictWordsBody)))
testQWordsTitle_lil = lil_matrix((len(dictId), len(dictWordsTitle)))
for chunk in reader:
    for Id, title, body in zip(chunk['Id'], chunk['Title'], chunk['Body']):
        # skip the first 2000000 rows, which were processed in the previous cell
        if count > 2000000:
            if count == 2000001:
                print("start of evaluation")
            titlewords = tokenizeWords(title)
            settitle = set(titlewords)
            iterWords = [w for w in settitle if w in dictWordsTitle]
            for word in iterWords:
                testQWordsTitle_lil[dictId[str(Id)], dictWordsTitle[word]] += titlewords.count(word)
            bodywords = tokenizeWords(body)
            setbody = set(bodywords)
            iterWords = [w for w in setbody if w in dictWordsBody]
            for word in iterWords:
                testQWordsBody_lil[dictId[str(Id)], dictWordsBody[word]] += bodywords.count(word)
        count += 1
fname = "testWordsQTitle_2000000-2013337.mtx"
mmwrite(fname, testQWordsTitle_lil)
fname = "testWordsQBody_2000000-2013337.mtx"
mmwrite(fname, testQWordsBody_lil)
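A quick shape check on the remainder shard (2013337 - 2000000 = 13337 rows):
In [ ]:
print(mmread("testWordsQTitle_2000000-2013337.mtx").shape)  # expected (13337, len(dictWordsTitle))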